Let’s load the tidyverse package.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## ── Conflicts ────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
Let’s import our data using read_csv. Note that the NHANES data is in the data directory so we need to include that.
nhanes <- read_csv("data/nhanes.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## id = col_double(),
## age = col_double(),
## weight = col_double(),
## height = col_double(),
## bmi = col_double(),
## days_phys_hlth_bad = col_double(),
## days_ment_hlth_bad = col_double(),
## sleep_hrs_night = col_double(),
## phys_active_days = col_double(),
## tv_hrs_day = col_logical()
## )
## See spec(...) for full column specifications.
Let’s see what our data looks like.
nhanes
With select we can select variables from the larger data frame.
nhanes %>%
select(age)
We can also use select for multiple variables:
nhanes %>%
select(age, education)
We can select a range of columns using the var1:var2 pattern
nhanes %>%
select(phys_active:smoke_now)
We can drop variables using the -var format:
nhanes %>%
select(-id)
We can drop a set of variables using the -(var1:var2) format:
nhanes %>%
select(-(id:age_decade))
We can drop a set of variables using the -(var1:var2) format. Drop the variables from health_gen to the end.
nhanes %>%
select(health_gen:last_col())
nhanes %>%
select(1:10)
nhanes %>%
select(contains("phys"))
We use mutate we make new variables or change existing ones.
We can use mutate in three ways:
Create a new variable with a specific value
# Make country variable
nhanes %>%
mutate(country = "United States") %>%
select(country)
Create a new variable based on other variables
nhanes %>%
mutate(height_inches = height / 2.54) %>%
select(height, height_inches)
Change an existing variable
nhanes %>%
mutate(height = height / 2.54) %>%
select(height)
We use filter to choose a subset of observations.
We use == to filter all observations that meet the criteria.
nhanes %>%
filter(gender == "female") %>%
select(gender)
We use != to select all observations that don’t meet the criteria.
nhanes %>%
filter(work != "Working") %>%
select(work)
We can use %in% to only include responses that much a group.
nhanes %>%
filter(health_gen %in% c("Good", "Vgood", "Excellent")) %>%
select(health_gen)
nhanes %>%
filter(health_gen == "Good" | health_gen == "Vgood" | health_gen == "Excellent") %>%
select(health_gen)
We can chain together multiple filter functions.
nhanes %>%
filter(gender == "male") %>%
filter(health_gen %in% c("Good", "Vgood", "Excellent")) %>%
select(gender, health_gen)
We can use <, >, <=, and => for numeric data.
nhanes %>%
filter(age >= 50) %>%
select(age)
nhanes %>%
filter(str_detect(education, "College"))
With summarize, we can go from a complete dataset down to a summary.
We use these functions with summarize.
Let’s calculate the mean number of days in which respondents report being physically active.
nhanes %>%
summarize(mean_days_phys_active = mean(phys_active_days))
This doesn’t work! Notice what the result is.
We need to add na.rm = TRUE to tell R to drop NA values.
nhanes %>%
summarize(mean_days_phys_active = mean(phys_active_days,
na.rm = TRUE))
nhanes %>%
drop_na(phys_active_days) %>%
summarize(mean_days_phys_active = mean(phys_active_days))
We can have multiple arguments in each usage of summarize. Let’s add the median along with the mean.
nhanes %>%
summarize(mean_days_phys_active = mean(phys_active_days,
na.rm = TRUE),
median_days_phys_active = median(phys_active_days,
na.rm = TRUE),
number_of_responses = n())
nhanes %>%
drop_na(phys_active_days) %>%
summarize(mean_days_phys_active = mean(phys_active_days,
na.rm = TRUE),
median_days_phys_active = median(phys_active_days,
na.rm = TRUE),
number_of_responses = n())
nhanes %>%
summarize(mean_hours_sleep = mean(sleep_hrs_night, na.rm =TRUE),
minimum_hours_sleep = min(sleep_hrs_night, na.rm =TRUE))
nhanes %>%
summarize(mean_hours_sleep = mean(sleep_hrs_night, na.rm =TRUE),
minimum_hours_sleep = min(sleep_hrs_night, na.rm =TRUE),
maximum_hours_sleep = max(sleep_hrs_night, na.rm =TRUE),
number_of_responses = n())
nhanes %>%
drop_na(sleep_hrs_night) %>%
summarize(mean_hours_sleep = mean(sleep_hrs_night),
max_hours_sleep = max(sleep_hrs_night))
summarize becomes truly powerful when paired with group_by, which enables us to perform calculations on each group. Let’s calculate the mean number of active days for each year the survey was done.
nhanes %>%
group_by(age_decade) %>%
summarize(mean_active_days = mean(phys_active_days,
na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
We can use group_by with multiple groups. Let’s calculate the mean number of active days for each year the survey was done and for gender.
nhanes %>%
group_by(survey_yr, gender, education) %>%
summarize(mean_active_days = mean(phys_active_days,
na.rm = TRUE))
## `summarise()` regrouping output by 'survey_yr', 'gender' (override with `.groups` argument)
nhanes %>%
filter(age_decade == "0-9") %>%
select(sleep_hrs_night)
nhanes %>%
drop_na(sleep_hrs_night) %>%
group_by(gender, age_decade) %>%
summarize(mean_sleep_hrs_night = mean(sleep_hrs_night,na.rm=TRUE)) %>%
mutate(mean_sleep_hrs_night_rounded = round(mean_sleep_hrs_night, digits = 1))
## `summarise()` regrouping output by 'gender' (override with `.groups` argument)
With arrange, we can reorder rows in a data frame based on the values of one or more variables. R arranges in ascending order by default.
nhanes %>%
arrange(age)
We can also arrange in descending order using desc().
nhanes %>%
arrange(desc(age))
We often use arrange at the end of chains to display things in order.
nhanes %>%
group_by(gender, age_decade) %>%
summarize(mean_active_days = mean(phys_active_days,
na.rm = TRUE)) %>%
drop_na(age_decade) %>%
arrange(desc(mean_active_days))
## `summarise()` regrouping output by 'gender' (override with `.groups` argument)
nhanes %>%
count(gender, age_decade)
Sometimes you want to save the results of your work to a new data frame.
Let’s say we want to get the average height in inches for women in various age groups.
This just displays the output:
nhanes %>%
filter(gender == "female") %>%
mutate(height_inches = height / 2.54) %>%
group_by(age_decade) %>%
summarize(mean_height_inches = mean(height_inches,
na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
This assigns that output to a new data frame.
nhanes <- read_csv("data/nhanes.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## id = col_double(),
## age = col_double(),
## weight = col_double(),
## height = col_double(),
## bmi = col_double(),
## days_phys_hlth_bad = col_double(),
## days_ment_hlth_bad = col_double(),
## sleep_hrs_night = col_double(),
## phys_active_days = col_double(),
## tv_hrs_day = col_logical()
## )
## See spec(...) for full column specifications.
female_height_by_age <- nhanes %>%
filter(gender == "female") %>%
mutate(height_inches = height / 2.54) %>%
group_by(age_decade) %>%
summarize(mean_height_inches = mean(height_inches,
na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
The tabyl function in the janitor package is mostly used for crosstabs, but you can use it to do frequencies.
nhanes %>%
tabyl(age_decade)
To do a crosstab, you just add another variable.
nhanes %>%
tabyl(age_decade, gender)
janitor has a set of functions that all start with adorn_ that add a number of things to our crosstabs. We call them after tabyl. For example, adorn_totals.
nhanes %>%
tabyl(age_decade, gender) %>%
adorn_totals(where = c("row", "col"))
We can add adorn_percentages to add percentages.
nhanes %>%
tabyl(age_decade, gender) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all")
We can then format these percentages using adorn_pct_formatting.
nhanes %>%
tabyl(age_decade, gender) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages() %>%
adorn_pct_formatting(digits = 0,
rounding = "half up")
If we want to include the n alongside percentages, we can use adorn_ns.
nhanes %>%
tabyl(age_decade, gender) %>%
adorn_totals(c("row", "col")) %>%
adorn_percentages() %>%
adorn_pct_formatting(digits = 0,
rounding = "half up") %>%
adorn_ns()
## Warning: `as_data_frame()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
We can add titles to our crosstabs using adorn_title.
nhanes %>%
tabyl(age_decade, gender) %>%
adorn_totals(c("row", "col")) %>%
adorn_percentages() %>%
adorn_pct_formatting() %>%
adorn_ns() %>%
adorn_title(placement = "combined")
We can also do three (or more) way crosstabs automatically by adding more variables to the tabyl function.
nhanes %>%
mutate(education = factor(education,
levels = c("8th Grade",
"9 - 11th Grade",
"High School",
"Some College",
"College Grad"))) %>%
tabyl(age_decade, gender, education) %>%
adorn_totals(c("row", "col")) %>%
adorn_percentages() %>%
adorn_pct_formatting() %>%
adorn_ns() %>%
adorn_title(placement = "combined")
## $`8th Grade`
## age_decade/gender female male Total
## 0-9 - (0) - (0) 100.0% (0)
## 10-19 - (0) - (0) 100.0% (0)
## 20-29 51.4% (19) 48.6% (18) 100.0% (37)
## 30-39 53.4% (39) 46.6% (34) 100.0% (73)
## 40-49 38.6% (34) 61.4% (54) 100.0% (88)
## 50-59 48.5% (32) 51.5% (34) 100.0% (66)
## 60-69 38.8% (26) 61.2% (41) 100.0% (67)
## 70+ 59.7% (37) 40.3% (25) 100.0% (62)
## <NA> 37.9% (22) 62.1% (36) 100.0% (58)
## Total 46.3% (209) 53.7% (242) 100.0% (451)
##
## $`9 - 11th Grade`
## age_decade/gender female male Total
## 0-9 - (0) - (0) 100.0% (0)
## 10-19 - (0) - (0) 100.0% (0)
## 20-29 41.9% (72) 58.1% (100) 100.0% (172)
## 30-39 43.0% (61) 57.0% (81) 100.0% (142)
## 40-49 35.7% (60) 64.3% (108) 100.0% (168)
## 50-59 45.7% (75) 54.3% (89) 100.0% (164)
## 60-69 51.5% (50) 48.5% (47) 100.0% (97)
## 70+ 54.2% (52) 45.8% (44) 100.0% (96)
## <NA> 65.3% (32) 34.7% (17) 100.0% (49)
## Total 45.3% (402) 54.7% (486) 100.0% (888)
##
## $`Some College`
## age_decade/gender female male Total
## 0-9 - (0) - (0) 100.0% (0)
## 10-19 - (0) - (0) 100.0% (0)
## 20-29 50.5% (271) 49.5% (266) 100.0% (537)
## 30-39 52.9% (239) 47.1% (213) 100.0% (452)
## 40-49 47.0% (167) 53.0% (188) 100.0% (355)
## 50-59 47.4% (183) 52.6% (203) 100.0% (386)
## 60-69 60.4% (177) 39.6% (116) 100.0% (293)
## 70+ 62.7% (104) 37.3% (62) 100.0% (166)
## <NA> 71.8% (56) 28.2% (22) 100.0% (78)
## Total 52.8% (1197) 47.2% (1070) 100.0% (2267)
##
## $`College Grad`
## age_decade/gender female male Total
## 0-9 - (0) - (0) 100.0% (0)
## 10-19 - (0) - (0) 100.0% (0)
## 20-29 55.6% (163) 44.4% (130) 100.0% (293)
## 30-39 52.1% (233) 47.9% (214) 100.0% (447)
## 40-49 58.6% (282) 41.4% (199) 100.0% (481)
## 50-59 52.0% (217) 48.0% (200) 100.0% (417)
## 60-69 42.9% (120) 57.1% (160) 100.0% (280)
## 70+ 47.1% (57) 52.9% (64) 100.0% (121)
## <NA> 45.8% (27) 54.2% (32) 100.0% (59)
## Total 52.4% (1099) 47.6% (999) 100.0% (2098)
##
## $NA_
## age_decade/gender female male Total
## 0-9 46.9% (653) 53.1% (738) 100.0% (1391)
## 10-19 49.8% (684) 50.2% (690) 100.0% (1374)
## 20-29 0.0% (0) 100.0% (4) 100.0% (4)
## 30-39 0.0% (0) 100.0% (2) 100.0% (2)
## 40-49 100.0% (3) 0.0% (0) 100.0% (3)
## 50-59 - (0) - (0) 100.0% (0)
## 60-69 100.0% (1) 0.0% (0) 100.0% (1)
## 70+ 100.0% (2) 0.0% (0) 100.0% (2)
## <NA> 0.0% (0) 100.0% (2) 100.0% (2)
## Total 48.3% (1343) 51.7% (1436) 100.0% (2779)
##
## $`High School`
## age_decade/gender female male Total
## 0-9 - (0) - (0) 100.0% (0)
## 10-19 - (0) - (0) 100.0% (0)
## 20-29 49.8% (156) 50.2% (157) 100.0% (313)
## 30-39 47.3% (105) 52.7% (117) 100.0% (222)
## 40-49 44.6% (135) 55.4% (168) 100.0% (303)
## 50-59 42.8% (116) 57.2% (155) 100.0% (271)
## 60-69 58.6% (106) 41.4% (75) 100.0% (181)
## 70+ 68.6% (96) 31.4% (44) 100.0% (140)
## <NA> 64.4% (56) 35.6% (31) 100.0% (87)
## Total 50.8% (770) 49.2% (747) 100.0% (1517)